\documentclass[final]{siamart171218}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{bm}

\setlength{\oddsidemargin}{0.65in}
\setlength{\evensidemargin}{0.65in}

\title{Differential count analysis with a topic model}

\author{Peter Carbonetto\thanks{Dept. of Human Genetics and the Research Computing Center, University of Chicago, Chicago, IL}}

\begin{document}

\maketitle

\section{Motivation, and overview of methods}

The aim of this document is to derive, from first principles, a method
for analysis of differential gene expression using a topic model (also
known as ``grade of membership model'' \cite{dey-2017}). This method
may have other uses --- say, to identify ``key words'' in a topic
modeling analysis of text documents --- but since our main motivation
is analysis of gene expression data, we describe the methods with that
application in mind.

Topic modeling brings a new twist to analysis of differential gene
expression: In conventional differential expression analysis, each
sample is assigned to a single condition; in topic modeling, the
assignments to each topic are {\em proportional.} This needs to be
accounted for in developing the new methods for differential
expression analysis.

For motivation, we begin with the ``log-fold change'' statistic
commonly used in microarray and RNA sequencing experiments to quantify
expression differences between two conditions (e.g.,
\cite{cui-churchill-2003, quackenbush-2002}). The log-fold change for
gene $j$ and condition $k$ is a ratio of two conditional expectations,
\begin{equation}
\beta_{jk} \equiv
\log_2 \frac{E[\,x_j \,|\, \mathrm{condition} = k\,]}
            {E[\,x_j \,|\, \mathrm{condition} \neq k\,]},
\label{eq:lfc}
\end{equation}
where $x_j$ is the measured expression level of gene
$j$.\footnote{Defining $x_{jk}$ as the total gene expression for gene
  $j$ among all samples (expression profiles) in condition $k$, $x_j$
  as the total gene expression for gene $j$ in all samples, $n_k$ as
  the number of samples in condition $k$, and $n$ as the total sample
  size, the log-fold change can be computed as $\beta_{jk} = \log_2
  \Big\{ \frac{x_{jk}}{x_j - x_{jk}} \times \frac{n - n_k}{n_k}
  \Big\}$.} The way in which the expression level $x_j$ is defined can
lead to different log-fold change statistics. For example, several
popular methods for analyzing differential expression compare changes
in {\em relative} expression (say, relative to total expression in a
cell) \cite{bullard-2010, voom, DESeq2, edgeR}. As we will see, a
topic modeling perspective provides a natural way to analyze
differential gene expression when comparing either relative or
absolute expression levels.

\section{A binomial model}

To develop the methods, we begin with a simple binomial model that
predicts expression of a single gene given the proportional
assignments to a topic:
\begin{equation}
x_i \sim \mathrm{Binom}(s_i, \pi_i).
\label{eq:binomial}
\end{equation}
Here, $x_i$ is the expression level of the target gene in sample $i$,
$s_i$ is the total expression in sample $i$, and $\mathrm{Binom}(n,
\theta)$ denotes the binomial distribution with $n$ trials and success
probability $\theta$. In this simple model, the binomial probabilities
are defined as
\begin{equation}
\pi_i = (1 - q_i) p_0 + q_i p_1,
\label{eq:binomial-prob}
\end{equation}
where $q_i \in [0,1]$ is the (known) proportion of sample $i$ that is
attributed to the topic, and $p_0, p_1 \in [0, 1]$ are two unknowns to
be estimated.\footnote{This is a special case of the {\em linear
    probability model} in econometrics called \cite{stock-watson}. The
  linear probability model would be typically written as $\pi_i = b_0
  + q_i b$, where $b_0 = p_0$ and $n = p_1 - p_0 \in [-1, +1]$. This
  is a regression model for the binomial probability $\pi_i$, in which
  $\pi_i$ increases linearly with topic proportion $q_i$.}

Statistical inference with this simple binomial model implements
analysis of differential gene expression. In particular, $\beta \equiv
\log_2(p_1/p_0)$ is the {\em log-fold change in relative expression.}
To show that this is so, consider the following statistical process
for generating the counts $x_1, \ldots, x_n$:
\vspace{1em}
\begin{itemize}

\item for $i = 1, \ldots, n$
\begin{enumerate}

  \item for $t = 1, \ldots, s_i$
  \begin{enumerate}

  \item Sample a topic, $z_{it} \sim \mathrm{Binom}(1,q_i)$.

  \item Sample a gene, $w_{it} \,|\, z_{it} \sim \left\{\begin{array}{ll}
  \mathrm{Binom}(1,p_1) & \mbox{if $z_{it} = 1$} \\
  \mathrm{Binom}(1,p_0) & \mbox{otherwise.}
  \end{array}\right.$

  \end{enumerate}

  \item Generate the final gene count, 
  $x_i \leftarrow w_{i1} + \cdots + w_{is_i}$.

\end{enumerate}
\end{itemize}
\vspace{1em} In this statistical process, $q_i = p(z_{it} = 1)$ is the
topic probability, $p_1 = p(w_{it} = 1 \,|\, z_{it} = 1)$ is the
conditional probability that the gene is expressed given membership to
the topic, and $p_0 = p(w_{it} = 1 \,|\, z_{it} = 0)$ is the
probability that the gene is expressed when not belonging to the
topic. Therefore, we have
\begin{equation}
\beta \equiv \log_2 \frac{p_1}{p_0}  
= \log_2 \frac{p(w_{it} = 1 \,|\, z_{it} = 1)}
            {p(w_{it} = 1 \,|\, z_{it} = 0)}.
\label{eq:lfc-binom}
\end{equation}
{\em This is the log-fold change statistic for relative expression
  given proportional topic assignments $q_1, \ldots, q_n$.} The
binomial model \eqref{eq:binomial} can in fact be derived from this
statistical process (proof not shown). Therefore, estimating $p_0,
p_1$ for the binomial model \eqref{eq:binomial} provides an estimate
of the log-fold change \eqref{eq:lfc-binom}.

Before continuing, we point out that the $s_i$'s, in practice, do not
need to be the total expression for each sample $i$ ({\em i.e.}, the
total counts); for example, some researchers have suggested setting
$s_i$ to some pre-defined quantile of the sample's nonzero count
distribution (e.g., \cite{bullard-2010}). This approach has been
motivated in settings where a small number of genes are much more
highly expressed than the others, and therefore these few genes have a
large effect relative expression levels. In short, the binomial model
is quite flexible, and can accommodate different relative differential
expression analyses defines the $s_i$'s. The only constraint on $s_i$
is that it cannot be smaller than $x_i$.

\section{A Poisson model}

The Poisson likelihood with rate $\lambda = n\theta$ closely
approximates the Binomial likelihood $\mathrm{Binom}(n, \theta)$ when
$\theta$ is small and $n$ is large. (The Poisson arises as the
limiting distribution of the binomial as $n \rightarrow \infty, \pi
\rightarrow 0, n\theta \rightarrow \lambda$.) This is usually the case in
gene expression studies; the total gene expression is large, whereas
the contribution of each gene is small, even for genes with the
highest levels of expression. This suggests a Poisson model of
expression $x_i$ given topic proportions $q_i$:
\begin{equation}
x_i \sim \mathrm{Poisson}(t_i \lambda_i),
\label{eq:poisson}
\end{equation}
in which the Poisson rates are defined as
\begin{equation}
\lambda_i = (1 - q_i) f_0 + q_i f_1,
\end{equation}
and the two unknowns to be estimated are $f_0, f_1 \geq 0$. Observe
that, unlike the binomial model, the unknowns for the Poisson model
are not constrained to be in $[0, 1]$, and so they do not necessarily
represent probabilities. However, by setting $t_i = s_i$, we have that
\begin{align*}
f_0 &\approx p_0 \\
f_1 &\approx p_1,
\end{align*}
when $p_0, p_1$ are close to zero and all $s_i$ are large. (See the
Appendix for an alternative motivation of the Poisson model.)

In the next section, we derive the mathematical expressions needed to
implement differential count analysis based on the Poisson model.

\section{Poisson model derivations}

To derive an algorithm for computing MLEs of $f_0, f_1$ in the Poisson
model \eqref{eq:poisson}, we begin with the log-likelihood,
\begin{equation}
\ell(f_0, f_1) \equiv \log p(x \,|\, f_0, f_1) = 
\sum_{i=1}^n x_i \log (t_i \lambda_i) - t_i \lambda_i +
\mbox{const,}
\label{eq:loglik-poisson}
\end{equation}
in which the ``const'' captures all terms that do not depend on $f_0$
or $f_1$. A useful identity is that the partial derivative of the
log-likelihood with respect to $\lambda_i$ is
\begin{equation*}
\frac{\partial\ell}{\partial\lambda_i} = \frac{x_i}{\lambda_i} - t_i.
\end{equation*}
Making use of this result, the partial derivatives of the
log-likelihood with respect to the model parameters $f_0$ and $f_1$
are
\begin{align}
\frac{\partial\ell}{f_0} &= 
\sum_{i=1}^n (x_i/\lambda_i - t_i) \times (1-q_i) \\
\frac{\partial\ell}{f_1} &= 
\sum_{i=1}^n (x_i/\lambda_i - t_i) \times q_i.
\end{align}
We use a quasi-Newton method implemented in the R function {\tt optim}
to minimize the negative log-likelihood
\eqref{eq:loglik-poisson}. Note that in the special case in which all
the topic proportions are either 0 or 1, the MLEs have a simple
closed-form solution:
\begin{align}
f_0 &= \frac{\sum_{i=1}^n (1 - q_i) x_i}{\sum_{i=1}^n (1 - q_i) t_i} \\
f_1 &= \frac{\sum_{i=1}^n q_i x_i}{\sum_{i=1}^n q_i t_i} 
\end{align}

\subsection{EM for Poisson model} 

We also implement a simple EM algorithm for fitting the Poison model
\eqref{eq:poisson}. The key is to introduce latent variables $a_i \sim
\mathrm{Poisson}(t_i (1-q_i) f_0)$ and $b_i \sim \mathrm{Poisson}(t_i
q_i f_1)$, then work with the expected complete log-likelihood
$E[\,\log p(x, a, b \,|\, f_0, f_1)\,]$. The ``M-step'' updates work
out to
\begin{align}
f_0 &= \frac{\sum_{i=1}^n \phi_i}{\sum_{i=1}^n t_i(1-q_i)} \\
f_1 &= \frac{\sum_{i=1}^n \gamma_i}{\sum_{i=1}^n t_i q_i},
\end{align}
where we have introduced notation for the posterior expectations of
the latent variables, $\phi_i \equiv E[a_i]$ and $\gamma_i \equiv
E[b_i]$. It can be shown that the posterior distribution of $(a_i,
b_i)$ is multinomial with number of trials $x_i$ and event
probabilities proportional to $(1-q_i) f_0$ and $q_i f_1$. So the
posterior expectations computed in the ``E-step'' are
\begin{align}
\phi_i   &= x_i (1 - q_i) f_0 / \lambda_i \\
\gamma_i &= x_i q_i f_1 / \lambda_i.
\end{align}
This completes the description of the EM algorithm for the Poisson
model.

\subsection{glm identity parameterization}

Once we have obtained MLEs of $f_0$ and $f_1$, we would also like to
characterize uncertainty in these estimates---that is, compute the
standard error (s.e.). In particular, we are interested in the s.e. of
the log-fold change statistic, $\beta$. As an intermediate step, we
consider the ``glm identity'' parameterization $\lambda_i = b_0 + q_i
b$, where $b = f_1 - f_0$ and $b_0 = f_0$. This parameterization can
be implemented using {\tt glm} in R with {\tt family = poisson(link =
  "identity")}, and therefore can be used to verify our calculations.

Under the Laplace approximation to the likelihood at the MLE
$(\hat{b}_0, \hat{b})$, the covariance matrix is $-H^{-1}$, where $H$
is the $2 \times 2$ matrix of second-order partial derivatives,
\begin{equation*}
\frac{\partial^2\ell}{\partial b_0^2} = 
-\sum_{i=1}^n \frac{x_i}{\lambda_i^2}, \qquad
\frac{\partial^2\ell}{\partial b^2} = 
-\sum_{i=1}^n \frac{x_i q_i^2}{\lambda_i^2}, \qquad
\frac{\partial^2\ell}{\partial b_0 \partial b} = 
-\sum_{i=1}^n \frac{x_i q_i}{\lambda_i^2}.
\end{equation*}
This result can be used to obtain the standard errors and $z$-scores
for $\hat{b}_0$ and $\hat{b}$.

\subsection{Log-fold change parameterization}

Here we derive the s.e. for the MLE of $\beta \equiv \log
(f_1/f_0)$. (For convenience, we use the natural logarithm here rather
than the base-2 logarithm; to obtain the base-2 log-fold change
statistic and its s.e., divide by $\log 2$.) With this new
parameterization, the Poisson rates are
\begin{equation}
\lambda_i = f_0 \times \{1 - q_i(1-e^{\beta})\}
\end{equation}
The second-order partial derivatives needed to compute the $2 \times
2$ Hessian are
\begin{align}
\frac{\partial\ell}{f_0} &= \frac{1}{f_0} \sum_{i=1}^n 
x_i - t_i\lambda_i \\
\frac{\partial\ell}{\beta} &= f_1 \sum_{i=1}^n 
(x_i/\lambda_i - t_i) \times q_i \\
\frac{\partial^2\ell}{f_0^2} &=
-\frac{1}{f_0^2} \sum_{i=1}^n x_i \\
\frac{\partial^2\ell}{\beta^2} &= 
-f_1 \sum_{i=1}^n t_i q_i - x_i f_0 q_i (1-q_i)/\lambda_i^2 \\
\frac{\partial^2\ell}{\partial f_0 \partial\beta} &= 
-\frac{f_1}{f_0} \sum_{i=1}^n t_i q_i.
\end{align}
The expression for $\frac{\partial^2\ell}{\partial f_0 \partial\beta}$
is the more complicated one, but fortuately it simplifies at the MLE,
$\beta = \hat{\beta}$ (at the MLE, the gradient of the log-likelihood
with respect to $\beta$ vanishes):
\begin{equation}
\frac{\partial^2\ell}{\beta^2} = -f_1^2 \sum_{i=1}^n x_i (q_i/\lambda_i)^2.
\end{equation}
Therefore, at the MLE $\beta = \hat{\beta}$, the standard error is
\begin{equation}
\mathrm{se}(\hat{\beta}) = \frac{1}{f_1} \times 
\sqrt{\frac{\bar{a}}{\bar{a} \times \bar{c} - \bar{b}^2}},
\end{equation}
where I've defined
\begin{equation*}
\bar{a} = \sum_{i=1}^n x_i, \qquad
\bar{b} = \sum_{i=1}^n t_i q_i, \qquad
\bar{c} = \sum_{i=1}^n x_i (q_i/\lambda_i)^2.
\end{equation*}
From this, the $z$-score is recovered as $z =
\hat{\beta}/se(\hat{\beta})$.

\appendix

\section{More on Poisson model}

Consider the following process for generating the counts $x_1, \ldots,
x_n$:
\begin{itemize}

\item for $i = 1, \ldots, n$
\begin{enumerate}

\item $a_i \sim \mathrm{Poisson}(f_1)$ 
\hfill Sample the within-topic gene count. \hspace{2em}

\item $b_i \sim \mathrm{Poisson}(f_0)$ 
\hfill Sample the outside-topic gene count. \hspace{2em}

\item $a_i' \sim \mathrm{Binom}(a_i, q_i)$ 
\hfill Subsample the within-topic genes. \hspace{2em}

\item $b_i' \sim \mathrm{Binom}(b_i, 1-q_i)$ 
\hfill Subsample the outside-topic genes. \hspace{2em}

\item $x_i \leftarrow a_i' + b_i'$ 
\hfill Generate the final gene count. \hspace{2em}

\end{enumerate}
\end{itemize}
In this generative process, $f_1 = E[a_i]$ represents the
within-topic gene rate, and $f_0 = E[b_i]$ is the outside-topic gene
rate, and therefore 
\begin{equation}
\beta^{\mathsf{abs}} \equiv 
\log_2 \frac{f_1}{f_0} = \log_2 \frac{E[a_i]}{E[b_i]}
\label{eq:lfc-poisson}
\end{equation}
is the {\em log-fold change statistic in (absolute) expression given
  proportional topic assignments $q_1, \ldots, q_n$.} For intuition,
when all the topic proportions $q_i$ are 0 or 1, this statistical
process simplify to $x_i \sim \mathrm{Poisson}(f_1)$ if $q_i = 1$, and
$x_i \sim \mathrm{Poisson}(f_0)$ if $q_i = 0$, and so
\eqref{eq:lfc-poisson} would reduce to the ratio of the mean
expression levels inside and outside the topic. The Poisson model
\eqref{eq:poisson} can be derived from this statistical process, and
so estimating $f_0, f_1$ for the Poisson model \eqref{eq:poisson}
provides an estimate of the log-fold change \eqref{eq:lfc-poisson}.

\bibliographystyle{siamplain}
\bibliography{diffcount}

\end{document}

